This script outlines the calculation of differentially-expressed
genes (DEGs) between current and never smokers for each of the datasets.
This is adapted from the default GEO2R script used for differential
analysis, as obtained from the GEO platform. This analysis always begins
with the series matrix rather than the raw .CEL files in these
calculations. (So, it is worth checking what preprocessing went into
each of the series matrices.) The adaptations are based on the
instructional video at this link: https://www.youtube.com/watch?v=A07xnSe7OTg The
instructor explains how to use QC plots to check the data in a stepwise
manner rather than using the default settings and trusting the output.
DEGs are displayed as volcano plots.
Each dataset follows a standard limma pipeline to compare current
vs. never smokers with the following variations: 1. log2 normalization
(if needed - used in some cases here) 2. quantile normalization (used in
all cases here) 3. vooma (used in all cases here)
To summarize the datasets, the normalization applied, and the
remaining potential issues based on QC plots: [Note: I should go back to
check for redundant samples between studies from the same groups, or
check how I dealt with that before]
GSE4302: - 44 samples - 3050 DEGs - quantile normalization - Not
clean separation in PCA - Skewed towards low FDR values
GSE7895 (note this is my reference “CS/FS/NS” dataset): - 72 samples
- 681 DEGs - quantile normalization - Not clean separation in PCA
GSE14633: - 20 samples - 247 DEGs - quantile normalization - No QC
issues that I can see
GSE19027: - 30 samples - 1832 DEGs - log2 and quantile normalization
- Not clean separation in PCA - Skewed towards low FDR values
GSE20257: - 95 samples - 2331 DEGs - log2 and quantile normalization
- PCA could suggest batch effect or confounder (2 distinct clusters not
based on smoking status) - Skewed towards low FDR values
GSE63127 (note this is my reference “CS vs NS” dataset): - 182
samples - 6270 DEGs - log2 and quantile normalization - PCA could
suggest batch effect or confounder (4 distinct clusters not based on
smoking status) - Skewed towards low FDR values
GSE4302
Load samples from GEO
# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0
################################################################
# Differential expression analysis with limma
library(GEOquery)
## Loading required package: Biobase
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
library(limma)
##
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# load series and platform data from GEO
gset <- getGEO("GSE4302", GSEMatrix =TRUE, AnnotGPL=TRUE)
## Found 1 file(s)
## GSE4302_series_matrix.txt.gz
if (length(gset) > 1) idx <- grep("GPL570", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]
# make proper column names to match toptable
fvarLabels(gset) <- make.names(fvarLabels(gset))
# group membership for all samples
gsms <- paste0("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
"XXXXXXXXXXXXXX100X0X000XXX000X000X0XXX001110001100",
"000001111101101110")
sml <- strsplit(gsms, split="")[[1]]
# filter out excluded samples (marked as "X")
sel <- which(sml != "X")
sml <- sml[sel]
gset <- gset[ ,sel]
gset <- gset[complete.cases(exprs(gset)), ] # skip missing values
Checks and normalization
## Make histograms and boxplots to check if the data is log-transformed and needs quantile normalization ##
hist(as.matrix(exprs(gset))) # Looks ok

boxplot(exprs(gset)) # Looks ok

max(exprs(gset)) # 14.29166
## [1] 14.29166
min(exprs(gset)) # 3.490193
## [1] 3.490193
# 3.49 is an odd minimum
#exprs(gset) <- normalizeBetweenArrays(exprs(gset)) # For now trying without quantile normalization
# boxplot(exprs(gset)) # Looks ok
# max(exprs(gset)) # 14.08906
# min(exprs(gset)) # 3.568253
Set up design matrix
# assign samples to groups and set up design matrix
gs <- factor(sml)
groups <- make.names(c("healthy_control","current_smoker"))
levels(gs) <- groups
gset$group <- gs
design <- model.matrix(~group + 0, gset)
colnames(design) <- levels(gs)
PCA plot, and save expression data
## Plot PCA ##
colz <- as.numeric(as.factor(gs)) # Get color values from group
plotMDS(exprs(gset),
gene.selection = "common",
main = "PCA for CS vs NS GSE4302",
col = colz,
#labels = gs
pch = 1
)

# Some separation though not clean
# Saving expression data for GSE4302
GSE4302_expr_matrix <- exprs(gset) # Extract the expression matrix
GSE4302_sstat_data <- gset$group # Extract the smoking status labels
GSE4302_gene_symbol <- gset@featureData@data[["Gene.symbol"]] # Get the gene symbols to replace probeset IDs with
# Save the data
write.table(GSE4302_gene_symbol, "../2_Outputs/2_Airway_expression/GSE4302_gene_symbol.txt")
write.table(GSE4302_expr_matrix, "../2_Outputs/2_Airway_expression/GSE4302_expr_matrix_20241115.txt")
write.table(GSE4302_sstat_data, "../2_Outputs/2_Airway_expression/GSE4302_sstat_data.txt")
Vooma and fit
# calculate precision weights and show plot of mean-variance trend
v <- vooma(gset, design, plot=T)

# OR weights by group
# v <- voomaByGroup(gset, group=groups, design, plot=T, cex=0.1, pch=".", col=1:nlevels(gs))
v$genes <- fData(gset) # attach gene annotations
# fit linear model
fit <- lmFit(v)
Generate top table
# set up contrasts of interest and recalculate model coefficients
cts <- paste(groups[2], groups[1], sep="-")
cont.matrix <- makeContrasts(contrasts=cts, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
# compute statistics and table of top significant genes
fit2 <- eBayes(fit2, 0.01)
tT <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)
tT <- subset(tT, select=c("ID","Gene.symbol","logFC","adj.P.Val"))
# Filter unlabelled genes, duplicate genes
GSE4302_CS_NS_GEO2R_limma_all <- tT %>%
filter(Gene.symbol!= "") %>% # Remove blank gene symbols
group_by(Gene.symbol) %>%
slice_min(adj.P.Val, with_ties = TRUE) %>%
# For probesets mapping to same gene, keep one with lowest FDR. Keep ties for now to check later.
ungroup()
# Filter for FDR < 0.05
GSE4302_CS_NS_GEO2R_limma_sig <- GSE4302_CS_NS_GEO2R_limma_all %>%
filter(adj.P.Val <= 0.05) # Remove FDR > 0.05 genes
# Checking for ties
ties <- GSE4302_CS_NS_GEO2R_limma_sig %>%
group_by(Gene.symbol) %>%
filter(n() > 1) %>%
ungroup()
print(ties)
## # A tibble: 2 × 4
## ID Gene.symbol logFC adj.P.Val
## <chr> <chr> <dbl> <dbl>
## 1 201468_s_at NQO1 1.43 1.16e-10
## 2 210519_s_at NQO1 1.48 1.16e-10
# Only tie is NQO1. Both probes have s suffix. I will arbitrarily choose the one of magnitude 1.56 rather than 1.50
GSE4302_CS_NS_GEO2R_limma_sig <- GSE4302_CS_NS_GEO2R_limma_sig %>%
filter(ID != "201468_s_at")
head(GSE4302_CS_NS_GEO2R_limma_sig)
## # A tibble: 6 × 4
## ID Gene.symbol logFC adj.P.Val
## <chr> <chr> <dbl> <dbl>
## 1 1564307_a_at A2ML1 0.219 1.78e- 3
## 2 241552_at AA06 0.178 4.16e- 3
## 3 223593_at AADAT -0.517 6.20e- 4
## 4 209459_s_at ABAT -0.644 5.46e- 4
## 5 1553605_a_at ABCA13 -1.11 4.54e-11
## 6 203192_at ABCB6 0.406 1.14e- 2
nrow(GSE4302_CS_NS_GEO2R_limma_sig) # 3508 is a heckuva lot of genes for a small dataset...
## [1] 3508
Checking P value distribution and Q-Q plot
# Visualize and quality control test results.
# Build histogram of P-values for all genes. Normal test
# assumption is that most genes are not differentially expressed.
hist(GSE4302_CS_NS_GEO2R_limma_all$adj.P.Val, col = "grey", border = "white", xlab = "P-adj",
ylab = "Number of genes", main = "P-adj value distribution")

# Definitely skewed towards significant genes
hist(GSE4302_CS_NS_GEO2R_limma_sig$adj.P.Val, col = "grey", border = "white", xlab = "P-adj",
ylab = "Number of genes", main = "P-adj value distribution")

# Likewise a skew towards really significant genes within the significant genes
# summarize test results as "up", "down" or "not expressed"
dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=0)
# create Q-Q plot for t-statistic
t.good <- which(!is.na(fit2$F)) # filter out bad probes
qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic")

Checking expression levels
# General expression data analysis
ex <- exprs(gset)
# expression value distribution
par(mar=c(4,4,2,1))
title <- paste ("GSE4302", "/", annotation(gset), " value distribution", sep ="")
plotDensities(ex, group=gs, main=title, legend ="topright")

Volcano plot
### Volcano plot ###
library(ggplot2)
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.3.3
library(ggpmisc)
## Warning: package 'ggpmisc' was built under R version 4.3.3
## Loading required package: ggpp
## Warning: package 'ggpp' was built under R version 4.3.3
## Registered S3 methods overwritten by 'ggpp':
## method from
## heightDetails.titleGrob ggplot2
## widthDetails.titleGrob ggplot2
##
## Attaching package: 'ggpp'
## The following object is masked from 'package:ggplot2':
##
## annotate
# Renaming generically for convenience
DEGs <- GSE4302_CS_NS_GEO2R_limma_all
DEGs_significant <- GSE4302_CS_NS_GEO2R_limma_sig
GSE4302_CS_NS_GEO2R_limma_volcano <- ggplot(DEGs, aes(x = logFC, y = -log10(adj.P.Val))) +
geom_point(aes(color = ifelse(adj.P.Val < 0.05, "red", "black")), alpha = 0.6, show.legend = FALSE) +
stat_dens2d_labels(data = DEGs_significant,
aes(label = Gene.symbol), geom = "text_repel",
size = 2,
position = position_nudge_centre(x = 0.1,
y = 0.1,
direction = "radial"),
min.segment.length = 0,
fontface = "bold") +
scale_color_manual(values = c("black", "red")) +
theme_minimal() +
labs(title = "DEGs in GSE4302 Current vs. Never Smokers",
x = "log2FC",
y = "-log10(FDR)",
caption = paste0("significant DEGs: n = ", nrow(DEGs_significant)))
GSE4302_CS_NS_GEO2R_limma_volcano
## Warning: ggrepel: 293 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Save the output
write.table(GSE4302_CS_NS_GEO2R_limma_sig, '../2_Outputs/1_Airway_DEGs/GSE4302_CS_NS_GEO2R_limma_sig_20241114.txt', sep = '\t')
GSE7895
Loading libraries and samples for differential expression
analysis
# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0
################################################################
# Differential expression analysis with limma
### Modifying from GEO2R based on online tutorial: https://www.youtube.com/watch?v=A07xnSe7OTg
library(GEOquery)
library(limma)
#library(umap) # According to online tutorial, umap is mainly only useful if there are ~100 samples
library(dplyr)
# load series and platform data from GEO
gset <- getGEO("GSE7895", GSEMatrix =TRUE, AnnotGPL=TRUE)
## Found 1 file(s)
## GSE7895_series_matrix.txt.gz
if (length(gset) > 1) idx <- grep("GPL96", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]
# make proper column names to match toptable
fvarLabels(gset) <- make.names(fvarLabels(gset))
# group membership for all samples
gsms <- paste0("00000000000000000000011111111111111111111111111111",
"1111111111111111111111XXXXXXXXXXXXXXXXXXXXXXXXXXXX",
"XXXX")
sml <- strsplit(gsms, split="")[[1]]
# filter out excluded samples (marked as "X")
sel <- which(sml != "X")
sml <- sml[sel]
gset <- gset[ ,sel]
# Tutorial suggests not excluding samples, but I think you have to if they are not in the groups of interest
length(sml) # 72 samples
## [1] 72
Checks and normalizations for log2 and quantile
## Make histograms and boxplots to check if the data is log-transformed and needs quantile normalization ##
hist(as.matrix(exprs(gset))) # Values range 1-15, and 1 big peak around 3.

boxplot(exprs(gset)) # Same range of values, with similar-looking ranges, but not exactly the same

# Narrow range, therefore no log2 normalization needed
# Boxplots similar but nor identical, therefore quantile normalization needed (standard)
#exprs(gset) <- normalizeBetweenArrays(exprs(gset)) # For now we are not doing quantile normalization
# boxplot(exprs(gset)) # Now the boxplots look exactly the same
# min(exprs(gset)) # -0.2121706
# max(exprs(gset)) # 14.55543
Setting the two groups to compare, current vs never smokers, in
design matrix
# Setting the two groups to compare, current vs never smokers.
gs <- factor(sml)
groups <- make.names(c("never_smoker","current_smoker"))
levels(gs) <- groups
gset$group <- gs
# Make design matrix
design <- model.matrix(~group + 0, gset)
colnames(design) <- levels(gs)
PCA plot to check for group separation, and saving the expression
data
## Plot PCA ##
colz <- as.numeric(as.factor(gs)) # Get color values from group
plotMDS(exprs(gset),
gene.selection = "common",
main = "PCA for CS vs NS GSE7895",
col = colz,
pch = 1
#labels = gs
)

# Separation doesn't look good here, but definitely a light split.
# Saving expression data for GSE7895
GSE7895_expr_matrix <- exprs(gset) # Extract the expression matrix
GSE7895_sstat_data <- gset$group # Extract the smoking status labels
GSE7895_gene_symbol <- gset@featureData@data[["Gene.symbol"]] # Get the gene symbols to replace probeset IDs with
# # Save the data
write.table(GSE7895_expr_matrix, "../2_Outputs/2_Airway_expression/GSE7895_expr_matrix_20241115.txt")
write.table(GSE7895_sstat_data, "../2_Outputs/2_Airway_expression/GSE7895_sstat_data.txt")
write.table(GSE7895_gene_symbol, "../2_Outputs/2_Airway_expression/GSE7895_gene_symbol.txt")
Checking the variance vs. mean value dispersion to see whether vooma
is necessary, performing vooma, then main DEG calculation:
gset <- gset[complete.cases(exprs(gset)), ] # skip missing values
fit <- lmFit(gset, design)
#fit
## plot dispersion
plotSA(fit, main = "Fit")

# It looks very irregular, therefore use vooma.
# calculate precision weights and show plot of mean-variance trend
v <- vooma(gset, design, plot=T)

# OR weights by group
# v <- voomaByGroup(gset, group=groups, design, plot=T, cex=0.1, pch=".", col=1:nlevels(gs))
v$genes <- fData(gset) # attach gene annotations
fitV <- lmFit(v, design) # Fit based on voom
## ** Check on the "weights by group" thing
# I checked, and it appears the curves for both groups look pretty much the same?
# So I'm thinking it's not needed?
# set up contrasts of interest and recalculate model coefficients
cts <- paste(groups[2], groups[1], sep="-") # Swapped 2 and 1 to make the comparison in the correct direction
cont.matrix <- makeContrasts(contrasts=cts, levels=design)
fit2 <- contrasts.fit(fitV, cont.matrix)
# compute statistics and table of top significant genes
fit2 <- eBayes(fit2, proportion = 0.01) # Proportion is "assumed proportion of genes which are differentially expressed"
tT <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf) # Inf shows all the significant genes
tT <- subset(tT, select=c("ID","Gene.symbol","logFC","adj.P.Val"))
head(tT)
## ID Gene.symbol logFC adj.P.Val
## 206561_s_at 206561_s_at AKR1B10 4.307787 1.011579e-19
## 202437_s_at 202437_s_at CYP1B1 5.086021 4.497583e-18
## 202831_at 202831_at GPX2 3.023927 1.269275e-15
## 201468_s_at 201468_s_at NQO1 2.105256 1.624480e-15
## 204059_s_at 204059_s_at ME1 2.379102 4.025186e-15
## 202436_s_at 202436_s_at CYP1B1 4.952136 1.028740e-14
Processing and filtering top table to genes of interest
# Now I want to filter unlabelled genes, duplicate genes, and adj.P.Val < 0.05
GSE7895_CS_NS_GEO2R_limma_all <- tT %>%
filter(Gene.symbol != "") %>% # Remove blank gene symbols
# filter(adj.P.Val <= 0.05) %>% # Remove FDR > 0.05 genes
group_by(Gene.symbol) %>%
slice_min(adj.P.Val, with_ties = TRUE) %>%
# For probesets mapping to same gene, keep one with lowest FDR. Keep ties for now to check later.
ungroup()
head(GSE7895_CS_NS_GEO2R_limma_all)
## # A tibble: 6 × 4
## ID Gene.symbol logFC adj.P.Val
## <chr> <chr> <dbl> <dbl>
## 1 220951_s_at A1CF 0.00498 0.998
## 2 217757_at A2M 0.00191 1.00
## 3 219488_at A4GALT 0.0550 0.995
## 4 221131_at A4GNT -0.0169 0.853
## 5 218075_at AAAS -0.117 0.727
## 6 218434_s_at AACS 0.591 0.00427
GSE7895_CS_NS_GEO2R_limma_sig <- GSE7895_CS_NS_GEO2R_limma_all %>%
filter(adj.P.Val <= 0.05) # Remove FDR > 0.05 genes
head(GSE7895_CS_NS_GEO2R_limma_sig)
## # A tibble: 6 × 4
## ID Gene.symbol logFC adj.P.Val
## <chr> <chr> <dbl> <dbl>
## 1 218434_s_at AACS 0.591 0.00427
## 2 203192_at ABCB6 0.890 0.00000531
## 3 202804_at ABCC1 0.610 0.00000386
## 4 203196_at ABCC4 -0.380 0.0197
## 5 205566_at ABHD2 1.13 0.000000100
## 6 200965_s_at ABLIM1 0.439 0.0400
# Checking for ties
ties <- GSE7895_CS_NS_GEO2R_limma_sig %>%
group_by(Gene.symbol) %>%
filter(n() > 1) %>%
ungroup()
print(ties)
## # A tibble: 0 × 4
## # ℹ 4 variables: ID <chr>, Gene.symbol <chr>, logFC <dbl>, adj.P.Val <dbl>
# The only gene with a tie is MUC5AC, with a log2FC of 2.42 versus 2.46.
# One of the 2 probe IDs has an 'x' suffix.
# See the description of suffixes at https://rmflight.github.io/affyMM/
# Given this, I will decide to remove the one with the 'x' suffix.
GSE7895_CS_NS_GEO2R_limma_sig <- GSE7895_CS_NS_GEO2R_limma_sig %>%
filter(ID != '214303_x_at')
head(GSE7895_CS_NS_GEO2R_limma_sig)
## # A tibble: 6 × 4
## ID Gene.symbol logFC adj.P.Val
## <chr> <chr> <dbl> <dbl>
## 1 218434_s_at AACS 0.591 0.00427
## 2 203192_at ABCB6 0.890 0.00000531
## 3 202804_at ABCC1 0.610 0.00000386
## 4 203196_at ABCC4 -0.380 0.0197
## 5 205566_at ABHD2 1.13 0.000000100
## 6 200965_s_at ABLIM1 0.439 0.0400
# **Note: Maybe I should be taking the suffixes into account in the initial filtering? As I think I did a while back? To look into.
# Save this output
# write.table(GSE7895_CS_NS_GEO2R_limma_sig, file='/Users/liambrockley/Desktop/Former_Smokers_Project/Former_Smokers_Aim_1/2_Output/GEO2R_limma_analysis/GSE7895/GSE7895_CS_NS_GEO2R_limma_sig.txt', row.names=FALSE, sep="\t")
More plots for QC (not strictly necessary, and some would need
modification)
# Visualize and quality control test results.
# Build histogram of P-values for all genes. Normal test
# assumption is that most genes are not differentially expressed.
hist(GSE7895_CS_NS_GEO2R_limma_all$adj.P.Val, col = "grey", border = "white", xlab = "P-adj",
ylab = "Number of genes", main = "P-adj value distribution")

# Looks like most genes are not significant. Good.
# create Q-Q plot for t-statistic
t.good <- which(!is.na(fit2$F)) # filter out bad probes
qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic")

# summarize test results as "up", "down" or "not expressed"
dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=0)
GSE14633
Load libraries and samples
# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0
################################################################
# Differential expression analysis with limma
library(GEOquery)
library(limma)
library(stringr)
# load series and platform data from GEO
gset <- getGEO("GSE14633", GSEMatrix =TRUE, AnnotGPL=FALSE)
## Found 1 file(s)
## GSE14633_series_matrix.txt.gz
if (length(gset) > 1) idx <- grep("GPL5175", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]
# make proper column names to match toptable
fvarLabels(gset) <- make.names(fvarLabels(gset))
# group membership for all samples
gsms <- "1X11111111100000000X00"
sml <- strsplit(gsms, split="")[[1]]
# filter out excluded samples (marked as "X")
sel <- which(sml != "X")
sml <- sml[sel]
gset <- gset[ ,sel]
gset <- gset[complete.cases(exprs(gset)), ]
length(sml) # 20 samples
## [1] 20
Boxplots and histograms for normalization checks
## Make histograms and boxplots to check if the data is log-transformed and needs quantile normalization ##
hist(as.matrix(exprs(gset))) # Looks OK although odd that it starts at 4?

boxplot(exprs(gset)) # Looks decent with a few genes popping out

max(exprs(gset))
## [1] 15.309
min(exprs(gset))
## [1] 4
# 4 seems like an odd minimum. It has probably been transformed somehow. Something to keep in mind if there are issues.
# Based on graphs above, I should do quantile normalization but not log2 normalization.
Quantile normalization [for now we are not doing it]
#
# ## Quantile normalization ##
# # exprs(gset) <- normalizeBetweenArrays(exprs(gset)) # quantile normalization
#
# hist(as.matrix(exprs(gset))) # still good
# boxplot(exprs(gset)) # Now identical. Why is one gene popping out there?
# min(exprs(gset)) # 4
# max(exprs(gset)) # 13.29
#
# # Again, 4 is an odd minimum.
Assign samples to groups and set up design matrix
# Assign samples to groups and set up design matrix
gs <- factor(sml)
groups <- make.names(c("never_smoker","current_smoker"))
levels(gs) <- groups
gset$group <- gs
# Make design matrix
design <- model.matrix(~group + 0, gset)
colnames(design) <- levels(gs)
Plot PCA, save expression data
## Plot PCA ##
colz <- as.numeric(as.factor(gs)) # Get color values from group
plotMDS(exprs(gset),
gene.selection = "common",
main = "PCA for CS vs NS GSE14633",
col = colz,
pch = 1
)

# Definite clean separation here!
# Saving expression data for GSE14633
GSE14633_expr_matrix <- exprs(gset) # Extract the expression matrix
GSE14633_sstat_data <- gset$group # Extract the smoking status labels
GSE14633_gene_symbol <- gset@featureData@data[["gene_assignment"]] # Get the gene symbols to replace probeset IDs with
### Extract the simple gene names from the gene_assignment column ###
#Extract simple gene symbols and put them into gene_symbols
GSE14633_gene_symbol_tmp1 <- str_match_all(GSE14633_gene_symbol, "//\\s*(.*?)\\s*//")
#Put the simple gene symbols back into their respective spots
for (i in 1:length(GSE14633_gene_symbol)){
GSE14633_gene_symbol[i] <- GSE14633_gene_symbol_tmp1[[i]][1]
}
# Remove backslashes and surrounding whitespace
for (i in 1:length(GSE14633_gene_symbol)){
GSE14633_gene_symbol[i] <- gsub("//\\s*(.*?)\\s*//", "\\1", GSE14633_gene_symbol[i])
}
# Save the data
write.table(GSE14633_expr_matrix, "../2_Outputs/2_Airway_expression/GSE14633_expr_matrix_20241115.txt")
write.table(GSE14633_sstat_data, "../2_Outputs/2_Airway_expression/GSE14633_sstat_data.txt")
write.table(GSE14633_gene_symbol, "../2_Outputs/2_Airway_expression/GSE14633_gene_symbol.txt")
Fit linear model
## Try a normal fit ##
fit <- lmFit(gset, design)
plotSA(fit, main = "Fit") ## plot dispersion

#That's a weird-looking mean-variance graph, therefore vooma is needed
# calculate precision weights and show plot of mean-variance trend
v <- vooma(gset, design, plot=T)

# OR weights by group
# v <- voomaByGroup(gset, group=groups, design, plot=T, cex=0.1, pch=".", col=1:nlevels(gs))
v$genes <- fData(gset) # attach gene annotations
# fit linear model
fit <- lmFit(v)
DEGs calculation
library(dplyr)
# set up contrasts of interest and recalculate model coefficients
cts <- c(paste(groups[2],"-",groups[1],sep="")) # Swapped 2 and 1 because this is the desired direction
cont.matrix <- makeContrasts(contrasts=cts, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
# compute statistics and table of top significant genes
fit2 <- eBayes(fit2, 0.01)
tT <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)
tT <- subset(tT, select=c("ID","gene_assignment","logFC","adj.P.Val"))
head(tT)
## ID
## 3748957 3748957
## 2548699 2548699
## 2962820 2962820
## 3568603 3568603
## 2676671 2676671
## 3884830 3884830
## gene_assignment
## 3748957 NM_000691 // ALDH3A1 // aldehyde dehydrogenase 3 family, member A1 // 17p11.2 // 218 /// NM_001135168 // ALDH3A1 // aldehyde dehydrogenase 3 family, member A1 // 17p11.2 // 218 /// NM_001135167 // ALDH3A1 // aldehyde dehydrogenase 3 family, member A1 // 17p11.2 // 218 /// ENST00000225740 // ALDH3A1 // aldehyde dehydrogenase 3 family, member A1 // 17p11.2 // 218 /// ENST00000457500 // ALDH3A1 // aldehyde dehydrogenase 3 family, member A1 // 17p11.2 // 218 /// ENST00000444455 // ALDH3A1 // aldehyde dehydrogenase 3 family, member A1 // 17p11.2 // 218 /// AK091272 // ALDH3A1 // aldehyde dehydrogenase 3 family, member A1 // 17p11.2 // 218 /// ENST00000379258 // ALDH3A1 // aldehyde dehydrogenase 3 family, member A1 // 17p11.2 // 218 /// ENST00000333946 // ALDH3A1 // aldehyde dehydrogenase 3 family, member A1 // 17p11.2 // 218 /// ENST00000457844 // ALDH3A1 // aldehyde dehydrogenase 3 family, member A1 // 17p11.2 // 218 /// ENST00000395555 // ALDH3A1 // aldehyde dehydrogenase 3 family, member A1 // 17p11.2 // 218 /// ENST00000439102 // ALDH3A1 // aldehyde dehydrogenase 3 family, member A1 // 17p11.2 // 218 /// ENST00000426645 // ALDH3A1 // aldehyde dehydrogenase 3 family, member A1 // 17p11.2 // 218
## 2548699 NM_000104 // CYP1B1 // cytochrome P450, family 1, subfamily B, polypeptide 1 // 2p21 // 1545 /// ENST00000260630 // CYP1B1 // cytochrome P450, family 1, subfamily B, polypeptide 1 // 2p21 // 1545 /// ENST00000407341 // CYP1B1 // cytochrome P450, family 1, subfamily B, polypeptide 1 // 2p21 // 1545 /// U03688 // CYP1B1 // cytochrome P450, family 1, subfamily B, polypeptide 1 // 2p21 // 1545
## 2962820 NM_002395 // ME1 // malic enzyme 1, NADP(+)-dependent, cytosolic // 6q12 // 4199 /// ENST00000369705 // ME1 // malic enzyme 1, NADP(+)-dependent, cytosolic // 6q12 // 4199 /// AK289783 // ME1 // malic enzyme 1, NADP(+)-dependent, cytosolic // 6q12 // 4199
## 3568603 NM_002083 // GPX2 // glutathione peroxidase 2 (gastrointestinal) // 14q24.1 // 2877 /// ENST00000389614 // GPX2 // glutathione peroxidase 2 (gastrointestinal) // 14q24.1 // 2877 /// AK225871 // GPX2 // glutathione peroxidase 2 (gastrointestinal) // 14q24.1 // 2877
## 2676671 NM_001135055 // TKT // transketolase // 3p14.3 // 7086 /// NM_001064 // TKT // transketolase // 3p14.3 // 7086 /// ENST00000462138 // TKT // transketolase // 3p14.3 // 7086 /// ENST00000423525 // TKT // transketolase // 3p14.3 // 7086 /// L12711 // TKT // transketolase // 3p14.3 // 7086 /// AK293438 // TKT // transketolase // 3p14.3 // 7086 /// U55017 // TKT // transketolase // 3p14.3 // 7086 /// BC009970 // TKT // transketolase // 3p14.3 // 7086 /// AK289454 // TKT // transketolase // 3p14.3 // 7086 /// AK303191 // TKT // transketolase // 3p14.3 // 7086 /// AK301231 // TKT // transketolase // 3p14.3 // 7086 /// BC002433 // TKT // transketolase // 3p14.3 // 7086 /// ENST00000296289 // TKT // transketolase // 3p14.3 // 7086 /// ENST00000414014 // TKT // transketolase // 3p14.3 // 7086 /// BX649193 // TKT // transketolase // 3p14.3 // 7086 /// ENST00000450814 // TKT // transketolase // 3p14.3 // 7086 /// ENST00000469678 // TKT // transketolase // 3p14.3 // 7086 /// ENST00000424680 // TKT // transketolase // 3p14.3 // 7086 /// ENST00000472528 // TKT // transketolase // 3p14.3 // 7086 /// ENST00000423516 // TKT // transketolase // 3p14.3 // 7086 /// AK309482 // TKT // transketolase // 3p14.3 // 7086
## 3884830 NM_015568 // PPP1R16B // protein phosphatase 1, regulatory (inhibitor) subunit 16B // 20q11.23 // 26051 /// NM_001172735 // PPP1R16B // protein phosphatase 1, regulatory (inhibitor) subunit 16B // 20q11.23 // 26051 /// ENST00000299824 // PPP1R16B // protein phosphatase 1, regulatory (inhibitor) subunit 16B // 20q11.23 // 26051 /// ENST00000373331 // PPP1R16B // protein phosphatase 1, regulatory (inhibitor) subunit 16B // 20q11.23 // 26051 /// BC152467 // PPP1R16B // protein phosphatase 1, regulatory (inhibitor) subunit 16B // 20q11.23 // 26051
## logFC adj.P.Val
## 3748957 2.1763 0.0001408498
## 2548699 3.9054 0.0001408498
## 2962820 1.8267 0.0001673730
## 3568603 2.0100 0.0001679948
## 2676671 1.0628 0.0001679948
## 3884830 -1.6509 0.0001679948
### Extract the simple gene names from the gene_assignment column ###
#Extract simple gene symbols and put them into gene_symbols
gene_symbols <- str_match_all(tT$gene_assignment, "//\\s*(.*?)\\s*//")
#Put the simple gene symbols back into their respective spots
for (i in 1:length(gene_symbols)){
tT$gene_assignment[i] <- gene_symbols[[i]][1]
}
# Turn the column from list into a character vector
tT$gene_assignment <- unlist(tT$gene_assignment)
# Remove backslashes and surrounding whitespace
for (i in 1:length(tT$gene_assignment)){
tT$gene_assignment[i] <- gsub("//\\s*(.*?)\\s*//", "\\1", tT$gene_assignment[i])
}
head(tT)
## ID gene_assignment logFC adj.P.Val
## 3748957 3748957 ALDH3A1 2.1763 0.0001408498
## 2548699 2548699 CYP1B1 3.9054 0.0001408498
## 2962820 2962820 ME1 1.8267 0.0001673730
## 3568603 3568603 GPX2 2.0100 0.0001679948
## 2676671 2676671 TKT 1.0628 0.0001679948
## 3884830 3884830 PPP1R16B -1.6509 0.0001679948
# Filter unlabelled genes, duplicate genes, and adj.P.Val < 0.05
GSE14633_CS_NS_GEO2R_limma_all <- tT %>%
filter(gene_assignment!= "") %>% # Remove blank gene symbols
# filter(adj.P.Val <= 0.05) %>% # Remove FDR > 0.05 genes
group_by(gene_assignment) %>%
slice_min(adj.P.Val, with_ties = TRUE) %>%
# For probesets mapping to same gene, keep one with lowest FDR. Keep ties for now to check later.
ungroup()
GSE14633_CS_NS_GEO2R_limma_sig <- GSE14633_CS_NS_GEO2R_limma_all %>%
filter(adj.P.Val <= 0.05)
# Checking for ties
ties <- GSE14633_CS_NS_GEO2R_limma_sig %>%
group_by(gene_assignment) %>%
filter(n() > 1) %>%
ungroup()
print(ties)
## # A tibble: 0 × 4
## # ℹ 4 variables: ID <int>, gene_assignment <chr>, logFC <dbl>, adj.P.Val <dbl>
# No ties
head(GSE14633_CS_NS_GEO2R_limma_sig)
## # A tibble: 6 × 4
## ID gene_assignment logFC adj.P.Val
## <int> <chr> <dbl> <dbl>
## 1 2598145 ABCA12 1.44 0.0173
## 2 2599993 ABCB6 0.859 0.000511
## 3 3649890 ABCC1 0.579 0.00918
## 4 3726691 ABCC3 0.757 0.0179
## 5 3607447 ABHD2 0.770 0.00179
## 6 3424218 ACSS3 -0.967 0.0319
## Save table
#write.table(GSE14633_CS_NS_GEO2R_limma_sig, file='../2_Outputs/1_Airway_DEGs/GSE14633_CS_NS_GEO2R_limma_sig_20241114.txt', sep="\t")
Visualize and quality control test results.
# Visualize and quality control test results.
# Build histogram of P-values for all genes. Normal test
# assumption is that most genes are not differentially expressed.
hist(GSE14633_CS_NS_GEO2R_limma_all$adj.P.Val, col = "grey", border = "white", xlab = "P-adj",
ylab = "Number of genes", main = "P-adj value distribution")

# This is probably good, insignificant genes are a vast majority.
# summarize test results as "up", "down" or "not expressed"
dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=0)
# create Q-Q plot for t-statistic
t.good <- which(!is.na(fit2$F)) # filter out bad probes
qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic")

General expression data analysis
# General expression data analysis
ex <- exprs(gset)
# box-and-whisker plot
ord <- order(gs) # order samples by group
palette(c("#1B9E77", "#7570B3"))
par(mar=c(7,4,2,1))
title <- paste ("GSE14633", "/", annotation(gset), sep ="")
boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord])
legend("topleft", groups, fill=palette(), bty="n")

# expression value distribution
par(mar=c(4,4,2,1))
title <- paste ("GSE14633", "/", annotation(gset), " value distribution", sep ="")
plotDensities(ex, group=gs, main=title, legend ="topright")

Volcano plot
## Volcano plot ##
library(ggplot2)
library(ggrepel)
library(ggpmisc)
library(dplyr)
# Renaming generically for convenience
DEGs <- GSE14633_CS_NS_GEO2R_limma_all
DEGs_significant <- GSE14633_CS_NS_GEO2R_limma_sig
GSE14633_CS_NS_GEO2R_limma_volcano <- ggplot(DEGs, aes(x = logFC, y = -log10(adj.P.Val))) +
geom_point(aes(color = ifelse(adj.P.Val < 0.05, "red", "black")), alpha = 0.6, show.legend = FALSE) +
stat_dens2d_labels(data = DEGs_significant,
aes(label = gene_assignment), geom = "text_repel",
size = 2,
position = position_nudge_centre(x = 0.1,
y = 0.1,
direction = "radial"),
min.segment.length = 0,
fontface = "bold") +
scale_color_manual(values = c("black", "red")) +
theme_minimal() +
labs(title = "DEGs in GSE14633 Current vs. Never Smokers",
x = "log2FC",
y = "-log10(FDR)",
caption = paste0("significant DEGs: n = ", nrow(DEGs_significant)))
GSE14633_CS_NS_GEO2R_limma_volcano

GSE19027
Loading libraries and samples for differential expression
analysis
# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0
################################################################
# Differential expression analysis with limma
library(GEOquery)
library(limma)
library(dplyr)
# load series and platform data from GEO
gset <- getGEO("GSE19027", GSEMatrix =TRUE, AnnotGPL=TRUE)
## Found 1 file(s)
## GSE19027_series_matrix.txt.gz
## Using locally cached version of GPL96 found here:
## /var/folders/6v/k0nvhbqx02n3jrzyml87bzth0000gn/T//Rtmp6mF7Cd/GPL96.annot.gz
if (length(gset) > 1) idx <- grep("GPL96", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]
# make proper column names to match toptable
fvarLabels(gset) <- make.names(fvarLabels(gset))
# group membership for all samples
gsms <- "11000000X1XX011X11111XXXXXXXXXXXX111111XX11XXXXXX1X1010XXXXX"
sml <- strsplit(gsms, split="")[[1]]
# filter out excluded samples (marked as "X")
sel <- which(sml != "X")
sml <- sml[sel]
gset <- gset[ ,sel]
gset <- gset[complete.cases(exprs(gset)), ] # skip missing values (I am doing this early on -- should check that's okay)
length(sml) # 30 samples
## [1] 30
Check for normalization with histogram and boxplot
# Make histograms and boxplots to check if the data is log-transformed and needs quantile normalization ##
hist(as.matrix(exprs(gset))) # Very weird: peak close to zero and ranges past 6K

boxplot(exprs(gset)) # Very weird: everything is squished close to zero

# Note that there appear to be outlier samples. Keep in mind if anything downstream seems iffy.
max(exprs(gset)) # 71142.7
## [1] 71142.7
min(exprs(gset)) # 0
## [1] 0
# Check the log2 transform
hist(log2(as.matrix(exprs(gset)))) # wow much much better

boxplot(log2(exprs(gset))) # Better but it still looks a bit off to me. The samples still differ quite a bit
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out =
## z$out[z$group == : Outlier (-Inf) in boxplot 2 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out =
## z$out[z$group == : Outlier (-Inf) in boxplot 13 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out =
## z$out[z$group == : Outlier (-Inf) in boxplot 22 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out =
## z$out[z$group == : Outlier (-Inf) in boxplot 24 is not drawn

# Keep that in mind if there are issues downstream.
# Based on graphs above, I should do log2FC and quantile normalization.
Normalization
## log2FC and quantile normalization ##
exprs(gset) <- log2(exprs(gset)+1) # log2 normalization, add +1 to avoid -Inf
#exprs(gset) <- normalizeBetweenArrays(exprs(gset)) # quantile normalization - for now not doing it
hist(as.matrix(exprs(gset))) # Much better

boxplot(exprs(gset)) # They look very similar but not exactly the same?

min(exprs(gset)) # 0
## [1] 0
max(exprs(gset)) # 16.11845
## [1] 16.11845
Assign samples to groups and set up design matrix
# assign samples to groups and set up design matrix
gs <- factor(sml)
groups <- make.names(c("never_smoker","current_smoker"))
levels(gs) <- groups
gset$group <- gs
Plot PCA, save expression data
colz <- as.numeric(as.factor(gs)) # Get color values from group
plotMDS(exprs(gset),
gene.selection = "common",
main = "PCA for CS vs NS GSE19027",
col = colz,
pch = 1
# labels = gs
)
legend("bottom", legend = unique(as.factor(gs)),
fill = unique(colz),
title = "")

# Some separation but not clean separation
# Maybe the outliers should be removed?
# Check whether these are outliers in hierarchical clustering
# GSM470851, GSM470852
## Making a dendrogram to see if the same outliers are found
sample_dist <- dist(t(as.matrix(exprs(gset)))) # Transpose the matrix to calculate distances between samples
hc <- hclust(sample_dist) #Perform hierarchical clustering
plot(hc, main = "Dendrogram of Samples", xlab = "", sub = "", cex = 0.8) # Plot the dendrogram

# We do see those same 2 samples clustering with the nonsmoker samples though they are smoker samples
# Potentially could remove these two samples. For now I will keep them.
# Saving expression data for GSE19027
GSE19027_expr_matrix <- exprs(gset) # Extract the expression matrix
GSE19027_sstat_data <- gset$group # Extract the smoking status labels
GSE19027_gene_symbol <- gset@featureData@data[["Gene.symbol"]] # Get the gene symbols to replace probeset IDs with
# Save the data
write.table(GSE19027_expr_matrix, "../2_Outputs/2_Airway_expression/GSE19027_expr_matrix_20241115.txt")
write.table(GSE19027_sstat_data, "../2_Outputs/2_Airway_expression/GSE19027_sstat_data.txt")
write.table(GSE19027_gene_symbol, "../2_Outputs/2_Airway_expression/GSE19027_gene_symbol.txt")
Fit check and vooma fit
# Make design matrix
design <- model.matrix(~group + 0, gset)
colnames(design) <- levels(gs)
## Try a normal fit ##
fit <- lmFit(gset, design)
plotSA(fit, main = "Fit") ## plot dispersion

#That's a weird-looking mean-variance graph, therefore vooma is needed
## Do a vooma fit ##
# calculate precision weights and show plot of mean-variance trend
v <- vooma(gset, design, plot=T)

# OR weights by group
# v <- voomaByGroup(gset, group=groups, design, plot=T, cex=0.1, pch=".", col=1:nlevels(gs))
v$genes <- fData(gset) # attach gene annotations
# fit linear model with vooma
fit <- lmFit(v)
Main DEG calculation steps
# set up contrasts of interest and recalculate model coefficients
cts <- c(paste(groups[2],"-",groups[1],sep="")) # Swapped 2 and 1 because this is the desired direction
cont.matrix <- makeContrasts(contrasts=cts, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
# compute statistics and table of top significant genes
fit2 <- eBayes(fit2, 0.01)
tT <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)
tT <- subset(tT, select=c("ID","Gene.symbol","logFC","adj.P.Val"))
# Now I want to filter unlabelled genes, duplicate genes, and adj.P.Val < 0.05
GSE19027_CS_NS_GEO2R_limma_all <- tT %>%
filter(Gene.symbol != "") %>% # Remove blank gene symbols
# filter(adj.P.Val <= 0.05) %>% # Remove FDR > 0.05 genes
group_by(Gene.symbol) %>%
slice_min(adj.P.Val, with_ties = TRUE) %>%
# For probesets mapping to same gene, keep one with lowest FDR. Keep ties for now to check later.
ungroup()
# filter to FDR < 0.05
GSE19027_CS_NS_GEO2R_limma_sig <- GSE19027_CS_NS_GEO2R_limma_all %>%
filter(adj.P.Val <= 0.05) # Remove FDR > 0.05 genes
# Checking for ties
ties <- GSE19027_CS_NS_GEO2R_limma_sig %>%
group_by(Gene.symbol) %>%
filter(n() > 1) %>%
ungroup()
print(ties)
## # A tibble: 0 × 4
## # ℹ 4 variables: ID <chr>, Gene.symbol <chr>, logFC <dbl>, adj.P.Val <dbl>
# The only remaining tie is for MST1. Both probesets have the x suffix.
# The 2 values are -1.57 and -0.990
# I will arbitrarily decide to keep the probe with -1.57 to err on the side of finding relevant genes.
GSE19027_CS_NS_GEO2R_limma_sig <- GSE19027_CS_NS_GEO2R_limma_sig %>%
filter(ID != '216320_x_at')
head(GSE19027_CS_NS_GEO2R_limma_sig)
## # A tibble: 6 × 4
## ID Gene.symbol logFC adj.P.Val
## <chr> <chr> <dbl> <dbl>
## 1 217757_at A2M 0.962 0.0229
## 2 201511_at AAMP -0.476 0.0292
## 3 207225_at AANAT -0.979 0.0172
## 4 212772_s_at ABCA2 -0.711 0.00968
## 5 203192_at ABCB6 0.876 0.0440
## 6 202805_s_at ABCC1 1.09 0.0141
nrow(GSE19027_CS_NS_GEO2R_limma_sig)
## [1] 1915
# Save output
write.table(GSE19027_CS_NS_GEO2R_limma_sig, file='../2_Outputs/1_Airway_DEGs/GSE19027_CS_NS_GEO2R_limma_sig_20241114.txt', sep='\t', col.names = T)
Visualize and quality control test results.
Note that there seems to be something amiss with the FDR value
distribution.
## Visualize and quality control test results. #
hist(GSE19027_CS_NS_GEO2R_limma_all$adj.P.Val, col = "grey", border = "white", xlab = "P-adj",
ylab = "Number of genes", main = "P-adj value distribution for all genes post-filter")

# Definitely skewed disproportionately towards significant, low-FDR genes. To look into as a problem.
# create Q-Q plot for t-statistic
t.good <- which(!is.na(fit2$F)) # filter out bad probes
qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic")

General expression data analysis
# General expression data analysis
ex <- exprs(gset)
# box-and-whisker plot
ord <- order(gs) # order samples by group
palette(c("#1B9E77", "#7570B3"))
par(mar=c(7,4,2,1))
title <- paste ("GSE19027", "/", annotation(gset), sep ="")
boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord])
legend("topleft", groups, fill=palette(), bty="n")

# expression value distribution
par(mar=c(4,4,2,1))
title <- paste ("GSE19027", "/", annotation(gset), " value distribution", sep ="")
plotDensities(ex, group=gs, main=title, legend ="topright")

Volcano plot
## Volcano plot ##
library(ggplot2)
library(ggrepel)
library(ggpmisc)
# Renaming generically for convenience
DEGs <- GSE19027_CS_NS_GEO2R_limma_all
DEGs_significant <- GSE19027_CS_NS_GEO2R_limma_sig
GSE19027_CS_NS_GEO2R_limma_volcano <- ggplot(DEGs, aes(x = logFC, y = -log10(adj.P.Val))) +
geom_point(aes(color = ifelse(adj.P.Val < 0.05, "red", "black")), alpha = 0.6, show.legend = FALSE) +
stat_dens2d_labels(data = DEGs_significant,
aes(label = Gene.symbol), geom = "text_repel",
size = 2,
position = position_nudge_centre(x = 0.1,
y = 0.1,
direction = "radial"),
min.segment.length = 0,
fontface = "bold") +
scale_color_manual(values = c("black", "red")) +
theme_minimal() +
labs(title = "DEGs in GSE19027 Current vs. Never Smokers",
x = "log2FC",
y = "-log10(FDR)",
caption = paste0("significant DEGs: n = ", nrow(DEGs_significant)))
GSE19027_CS_NS_GEO2R_limma_volcano
## Warning: ggrepel: 135 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

GSE20257
Load libraries and samples from GEO
# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0
################################################################
# Differential expression analysis with limma
library(GEOquery)
library(limma)
library(dplyr)
# load series and platform data from GEO
gset <- getGEO("GSE20257", GSEMatrix =TRUE, AnnotGPL=TRUE)
## Found 1 file(s)
## GSE20257_series_matrix.txt.gz
## Using locally cached version of GPL570 found here:
## /var/folders/6v/k0nvhbqx02n3jrzyml87bzth0000gn/T//Rtmp6mF7Cd/GPL570.annot.gz
if (length(gset) > 1) idx <- grep("GPL570", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]
# make proper column names to match toptable
fvarLabels(gset) <- make.names(fvarLabels(gset))
# group membership for all samples
gsms <- paste0("XXXXXXXXXXXXXXXXX1100000000XXXXXXXXXXX111111110000",
"11111XXXXXXX00000000000111111111111111111000100100",
"11XXXX1100011000011111111111X000001")
sml <- strsplit(gsms, split="")[[1]]
# filter out excluded samples (marked as "X")
sel <- which(sml != "X")
sml <- sml[sel]
gset <- gset[ ,sel]
gset <- gset[complete.cases(exprs(gset)), ] # skip missing values
Checks and appropriate steps for log2 and quantile
normalization
## Make histograms and boxplots to check if the data is log-transformed and needs quantile normalization ##
hist(as.matrix(exprs(gset))) # It's squished to the left, so needs log2 transform

#boxplot(exprs(gset)) # It's scary and chaotic
max(exprs(gset)) # 130257
## [1] 130257
min(exprs(gset)) # 0.0964001
## [1] 0.0964001
## log2 and quantile normalization
exprs(gset) <- log2(exprs(gset)+1) # I think even without zero values it is better to do +1 so that we don't get large negative values(?)
#exprs(gset) <- normalizeBetweenArrays(exprs(gset)) # quantile normalization - not doing for now
hist(as.matrix(exprs(gset))) # much better

boxplot(exprs(gset))

max(exprs(gset)) # 16.16458
## [1] 16.99101
min(exprs(gset)) # 0.4225966
## [1] 0.1327744
Set up design matrix
# assign samples to groups and set up design matrix
gs <- factor(sml)
groups <- make.names(c("non-smoker","smoker"))
levels(gs) <- groups
gset$group <- gs
# Finish setting up the design matrix.
design <- model.matrix(~group + 0, gset)
colnames(design) <- levels(gs)
PCA plots, ComBat correction for batch, saving expression data
## Plot PCA ##
colz <- as.numeric(as.factor(gs)) # Get color values from group
PCA_1 <- plotMDS(exprs(gset),
gene.selection = "common",
main = "PCA for CS vs NS GSE20257",
col = colz,
#labels = gs
pch = 1
)

## We seem to have very clear separation between two clusters but not based on smoking status.
# This suggests some other confounding factor or batch effect.
# I could investigate the phenotypic data to try to figure out what it is, but for now I think I will just remove it with ComBat.
# The separation seems to be pretty much at zero on the PC1 axis with the exception of 3 samples, which I can manually re-assign.
# Assign batch labels based on the first dimension from MDS (equivalent to PC1)
unknown_batch_labels <- ifelse(PCA_1$x < 0, 1, 2)
# Get positions of the 3 samples that need to be reassigned
which(colnames(exprs(gset))=="GSM298232") #41
## [1] 41
which(colnames(exprs(gset))=="GSM298240") #49
## [1] 49
which(colnames(exprs(gset))=="GSM434050") #76
## [1] 76
# Manually reassign these 3 to the correct batch groups
unknown_batch_labels[41] <-1
unknown_batch_labels[49] <-1
unknown_batch_labels[76] # Seems this is already 2, which it should be
## [1] 2
# ComBat to remove batch effect
library(sva)
## Loading required package: mgcv
## Loading required package: nlme
## Warning: package 'nlme' was built under R version 4.3.3
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
## Loading required package: BiocParallel
exprs_matrix_combat <- ComBat(dat=exprs(gset), batch=unknown_batch_labels, mod=NULL, par.prior=TRUE, prior.plots=FALSE)
## Found2batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
# PCA of batch-corrected samples
PCA_2 <- plotMDS(exprs_matrix_combat,
gene.selection = "common",
main = "PCA for CS vs NS GSE20257: batch corrected",
col = colz,
#labels = gs
pch = 1
)

# Nice, now PC2 appears to be grouped based approximately on smoking status and PC1 does not appear to be in batches.
# Saving expression data for GSE20257
GSE20257_expr_matrix <- exprs(gset) # Extract the expression matrix
GSE20257_sstat_data <- gset$group # Extract the smoking status labels
GSE20257_gene_symbol <- gset@featureData@data[["Gene.symbol"]] # Get the gene symbols to replace probeset IDs with
# # Save the data
write.table(GSE20257_expr_matrix, "../2_Outputs/2_Airway_expression/GSE20257_expr_matrix_20241115.txt")
write.table(GSE20257_sstat_data, "../2_Outputs/2_Airway_expression/GSE20257_sstat_data.txt")
write.table(GSE20257_gene_symbol, "../2_Outputs/2_Airway_expression/GSE20257_gene_symbol.txt")
Vooma, fit, DEGs calculation, and filtering
## Crucial bit: Replace the expression values in gset with the batch corrected ones ##
exprs(gset) <- as.matrix(exprs_matrix_combat)
# calculate precision weights and show plot of mean-variance trend
v <- vooma(gset, design, plot=T)

# OR weights by group
# v <- voomaByGroup(gset, group=groups, design, plot=T, cex=0.1, pch=".", col=1:nlevels(gs))
v$genes <- fData(gset) # attach gene annotations
# fit linear model
fit <- lmFit(v)
# set up contrasts of interest and recalculate model coefficients
cts <- paste(groups[2], groups[1], sep="-")
cont.matrix <- makeContrasts(contrasts=cts, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
# compute statistics and table of top significant genes
fit2 <- eBayes(fit2, 0.01)
tT <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)
tT <- subset(tT, select=c("ID", "Gene.symbol", "logFC", "adj.P.Val"))
# Filter unlabelled genes, duplicate genes, and adj.P.Val < 0.05
GSE20257_CS_NS_GEO2R_limma_all <- tT %>%
filter(Gene.symbol!= "") %>% # Remove blank gene symbols
group_by(Gene.symbol) %>%
slice_min(adj.P.Val, with_ties = TRUE) %>%
# For probesets mapping to same gene, keep one with lowest FDR. Keep ties for now to check later.
ungroup()
GSE20257_CS_NS_GEO2R_limma_sig <- GSE20257_CS_NS_GEO2R_limma_all %>%
filter(adj.P.Val <= 0.05) # Remove FDR > 0.05 genes
# Checking for ties
ties <- GSE20257_CS_NS_GEO2R_limma_sig %>%
group_by(Gene.symbol) %>%
filter(n() > 1) %>%
ungroup()
print(ties)
## # A tibble: 2 × 4
## ID Gene.symbol logFC adj.P.Val
## <chr> <chr> <dbl> <dbl>
## 1 201311_s_at SH3BGRL -0.282 0.00182
## 2 201312_s_at SH3BGRL -0.300 0.00182
# 2 ties: AKR1C1 and NPAS3.
# Both of the probesets for AKR1C1 have the x suffix. I will arbitrarily keep the slightly larger one.
# Same suffixes for NPAS3 probesets. I will arbitrarily keep the slightly higher magnitude one.
# Removing the tied probesets
GSE20257_CS_NS_GEO2R_limma_sig <- GSE20257_CS_NS_GEO2R_limma_sig %>%
filter(ID != c('204151_x_at', '230412_at'))
head(GSE20257_CS_NS_GEO2R_limma_sig)
## # A tibble: 6 × 4
## ID Gene.symbol logFC adj.P.Val
## <chr> <chr> <dbl> <dbl>
## 1 232462_s_at A1BG-AS1 0.709 3.96e- 2
## 2 223593_at AADAT -0.797 9.37e-11
## 3 225522_at AAK1 0.333 2.74e- 5
## 4 202169_s_at AASDHPPT -0.189 3.67e- 2
## 5 1553604_at ABCA13 -0.751 3.15e- 6
## 6 223320_s_at ABCB10 -0.279 1.10e- 2
nrow(GSE20257_CS_NS_GEO2R_limma_sig)
## [1] 2424
# Save output
write.table(GSE20257_CS_NS_GEO2R_limma_sig, "../2_Outputs/1_Airway_DEGs/GSE20257_CS_NS_GEO2R_limma_sig_20241114.txt", sep = '\t', col.names = TRUE, row.names = FALSE)
QC checks: P-value distribution, Q-Q plot, Expression value
distribution
## Checking histogram of P-value distribution
hist(GSE20257_CS_NS_GEO2R_limma_all$adj.P.Val, col = "grey", border = "white", xlab = "P-adj",
ylab = "Number of genes", main = "P-adj value distribution")

# Skewed towards significant genes. Might be an issue there. To look into.
# summarize test results as "up", "down" or "not expressed"
dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=0)
# create Q-Q plot for t-statistic
t.good <- which(!is.na(fit2$F)) # filter out bad probes
qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic")

################################################################
# General expression data analysis
ex <- exprs(gset)
# expression value distribution
par(mar=c(4,4,2,1))
title <- paste ("GSE20257", "/", annotation(gset), " value distribution", sep ="")
plotDensities(ex, group=gs, main=title, legend ="topright")

Volcano plot
## Volcano plot ##
library(ggplot2)
library(ggrepel)
library(ggpmisc)
# Renaming generically for convenience
DEGs <- GSE20257_CS_NS_GEO2R_limma_all
DEGs_significant <- GSE20257_CS_NS_GEO2R_limma_sig
GSE20257_CS_NS_GEO2R_limma_volcano <- ggplot(DEGs, aes(x = logFC, y = -log10(adj.P.Val))) +
geom_point(aes(color = ifelse(adj.P.Val < 0.05, "red", "black")), alpha = 0.6, show.legend = FALSE) +
stat_dens2d_labels(data = DEGs_significant,
aes(label = Gene.symbol), geom = "text_repel",
size = 2,
position = position_nudge_centre(x = 0.1,
y = 0.1,
direction = "radial"),
min.segment.length = 0,
fontface = "bold") +
scale_color_manual(values = c("black", "red")) +
theme_minimal() +
labs(title = "DEGs in GSE20257 Current vs. Never Smokers",
x = "log2FC",
y = "-log10(FDR)",
caption = paste0("significant DEGs: n = ", nrow(DEGs_significant)))
GSE20257_CS_NS_GEO2R_limma_volcano
## Warning: ggrepel: 184 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

GSE63127
Load samples from GEO
# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0
################################################################
# Differential expression analysis with limma
library(GEOquery)
library(limma)
library(umap)
library(dplyr)
# load series and platform data from GEO
gset <- getGEO("GSE63127", GSEMatrix =TRUE, AnnotGPL=TRUE)
## Found 1 file(s)
## GSE63127_series_matrix.txt.gz
## Using locally cached version of GPL570 found here:
## /var/folders/6v/k0nvhbqx02n3jrzyml87bzth0000gn/T//Rtmp6mF7Cd/GPL570.annot.gz
if (length(gset) > 1) idx <- grep("GPL570", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]
# make proper column names to match toptable
fvarLabels(gset) <- make.names(fvarLabels(gset))
# group membership for all samples
gsms <- paste0("X00100011111X00000000000X00000X000000000000X000X00",
"XX00X0XXXXXXXXX1111111111111111111111X111X11111111",
"XXXX1XXXXXXXXXXXXXXXXXXXXXXXX001000000010100111111",
"01110111110011111001110011101011111001110101100011",
"111111111111111111111111111111")
sml <- strsplit(gsms, split="")[[1]]
# filter out excluded samples (marked as "X")
sel <- which(sml != "X")
sml <- sml[sel]
gset <- gset[ ,sel]
gset <- gset[complete.cases(exprs(gset)), ] # skip missing values
length(sml) # 182 samples
## [1] 182
Normalization and checks
## Make histograms and boxplots to check if the data is log-transformed and needs quantile normalization ##
hist(as.matrix(exprs(gset))) # skewed left, needs log2 transform

boxplot(exprs(gset)) # scary-looking

max(exprs(gset)) # 136808
## [1] 136808
min(exprs(gset)) # 0.0657913
## [1] 0.0657913
# Should do both log2 and quantile normalization
## log2 and Quantile normalization ##
exprs(gset) <- log2(exprs(gset)+1)
#exprs(gset) <- normalizeBetweenArrays(exprs(gset)) # quantile normalization - skipping for now
hist(as.matrix(exprs(gset))) # much better

boxplot(exprs(gset))

# min(exprs(gset)) # 0.3185814
# max(exprs(gset)) # 16.17398
Start design matrix, plot first PCA
# assign samples to groups and set up design matrix
gs <- factor(sml)
groups <- make.names(c("non-smoker","smoker"))
levels(gs) <- groups
gset$group <- gs
## Plot PCA ##
colz <- as.numeric(as.factor(gs)) # Get color values from group
plotMDS(exprs(gset),
gene.selection = "common",
main = "PCA for CS vs NS GSE63127",
col = colz,
pch = 1
)
legend("left", legend = unique(as.factor(gs)),
fill = unique(colz))

## We have 4 definite clusters that are not based on smoking status.
# To look into: batch effect or other phenotypic confounders
Correction for batch effect
Setup: cleaning up the table of phenotypic information
### I want to check for sources of batch effect or bias that can explain:
## Irregular clustering for PCA and U-map
## Unusually high number of DEGs
## FDRs very skewed towards low & significant values
## It seems that one way to do this is to plot a heatmap,
## with all the pData listed, and look for clusters that naturally form.
phenotypic_data <- pData(gset) # Extract phenotypic data
# The phenotypic data is terrible.
# This is filtered down to the samples that were included.
# I will first try to clean up the phenotypic data.
head(phenotypic_data)
## title geo_accession status
## GSM190150 Small airways, non-smoker 029 GSM190150 Public on Dec 16 2008
## GSM190153 Small airways, non-smoker 036 GSM190153 Public on Jun 17 2008
## GSM254157 small airways, smoker 112 GSM254157 Public on Jun 17 2008
## GSM298223 small airways, non-smoker 050 GSM298223 Public on Jul 13 2009
## GSM298227 small airways, non-smoker 076 GSM298227 Public on Jul 13 2009
## GSM298228 small airways, non-smoker 080 GSM298228 Public on Jul 13 2009
## submission_date last_update_date type channel_count
## GSM190150 May 17 2007 Aug 28 2018 RNA 1
## GSM190153 May 17 2007 Aug 28 2018 RNA 1
## GSM254157 Jan 03 2008 Aug 28 2018 RNA 1
## GSM298223 Jun 13 2008 Nov 12 2009 RNA 1
## GSM298227 Jun 13 2008 Aug 28 2018 RNA 1
## GSM298228 Jun 13 2008 Aug 28 2018 RNA 1
## source_name_ch1
## GSM190150 airway epithelial cells obtained by bronchoscopy and brushing
## GSM190153 airway epithelial cells obtained by bronchoscopy and brushing
## GSM254157 airway epithelial cells obtained by bronchoscopy and brushing
## GSM298223 airway epithelial cells obtained by bronchoscopy and brushing
## GSM298227 airway epithelial cells obtained by bronchoscopy and brushing
## GSM298228 airway epithelial cells obtained by bronchoscopy and brushing
## organism_ch1 characteristics_ch1 characteristics_ch1.1
## GSM190150 Homo sapiens age: 34 sex: M
## GSM190153 Homo sapiens age: 45 sex: F
## GSM254157 Homo sapiens age: 45 sex: M
## GSM298223 Homo sapiens age: 38 sex: M
## GSM298227 Homo sapiens age: 29 sex: M
## GSM298228 Homo sapiens age: 39 sex: F
## characteristics_ch1.2 characteristics_ch1.3
## GSM190150 ethnic group: black smoking status: non-smoker
## GSM190153 ethnic group: hispanic smoking status: non-smoker
## GSM254157 ethnic group: white smoking status: smoker, 23 pack-years
## GSM298223 ethnic group: hispanic smoking status: non-smoker
## GSM298227 ethnic group: hispanic smoking status: non-smoker
## GSM298228 ethnic group: asian smoking status: non-smoker
## molecule_ch1
## GSM190150 total RNA
## GSM190153 total RNA
## GSM254157 total RNA
## GSM298223 total RNA
## GSM298227 total RNA
## GSM298228 total RNA
## extract_protocol_ch1
## GSM190150 Trizol extraction and RNAeasy clean-up of total RNA was performed according to the manufacturer's instructions.
## GSM190153 Trizol extraction and RNAeasy clean-up of total RNA was performed according to the manufacturer's instructions.
## GSM254157 Trizol extraction and RNAeasy clean-up of total RNA was performed according to the manufacturer's instructions.
## GSM298223 Trizol extraction and RNAeasy clean-up of total RNA was performed according to the manufacturer's instructions.
## GSM298227 Trizol extraction and RNAeasy clean-up of total RNA was performed according to the manufacturer's instructions.
## GSM298228 Trizol extraction and RNAeasy clean-up of total RNA was performed according to the manufacturer's instructions.
## label_ch1
## GSM190150 biotin
## GSM190153 biotin
## GSM254157 biotin
## GSM298223 biotin
## GSM298227 biotin
## GSM298228 biotin
## label_protocol_ch1
## GSM190150 Biotinylated cRNA were prepared according to the standard Affymetrix protocol from 3 microg total RNA (Expression Analysis Technical Manual, 701022 Rev.2, Affymetrix).
## GSM190153 Biotinylated cRNA were prepared according to the standard Affymetrix protocol from 3 microg total RNA (Expression Analysis Technical Manual, 701022 Rev.2, Affymetrix).
## GSM254157 Biotinylated cRNA were prepared according to the standard Affymetrix protocol from 1-2 microg total RNA (Expression Analysis Technical Manual, 701022 Rev.2, Affymetrix).
## GSM298223 Biotinylated cRNA were prepared according to the standard Affymetrix protocol from 3 microg total RNA (Expression Analysis Technical Manual, 701022 Rev.2, Affymetrix).
## GSM298227 Biotinylated cRNA were prepared according to the standard Affymetrix protocol from 1-2 microg total RNA (Expression Analysis Technical Manual, 701022 Rev.2, Affymetrix).
## GSM298228 Biotinylated cRNA were prepared according to the standard Affymetrix protocol from 1-2 microg total RNA (Expression Analysis Technical Manual, 701022 Rev.2, Affymetrix).
## taxid_ch1
## GSM190150 9606
## GSM190153 9606
## GSM254157 9606
## GSM298223 9606
## GSM298227 9606
## GSM298228 9606
## hyb_protocol
## GSM190150 Following fragmentation, 15 microg of cRNA were hybridized for 16 hr at 45C on GeneChip HG-U133 Plus 2.0. GeneChips were washed and stained in the Affymetrix Fluidics Station 450.
## GSM190153 Following fragmentation, 15 microg of cRNA were hybridized for 16 hr at 45C on GeneChip HG-U133 Plus 2.0. GeneChips were washed and stained in the Affymetrix Fluidics Station 450.
## GSM254157 Following fragmentation, 10 microg of cRNA were hybridized for 16 hr at 45C on GeneChip HG-U133 Plus 2.0. GeneChips were washed and stained in the Affymetrix Fluidics Station 450.
## GSM298223 Following fragmentation, 15 microg of cRNA were hybridized for 16 hr at 45C on GeneChip HG-U133 Plus 2.0. GeneChips were washed and stained in the Affymetrix Fluidics Station 450.
## GSM298227 Following fragmentation, 10 microg of cRNA were hybridized for 16 hr at 45C on GeneChip HG-U133 Plus 2.0. GeneChips were washed and stained in the Affymetrix Fluidics Station 450.
## GSM298228 Following fragmentation, 10 microg of cRNA were hybridized for 16 hr at 45C on GeneChip HG-U133 Plus 2.0. GeneChips were washed and stained in the Affymetrix Fluidics Station 450.
## scan_protocol
## GSM190150 GeneChips were scanned using the GeneChip Scanner 3000 7G.
## GSM190153 GeneChips were scanned using the GeneChip Scanner 3000 7G.
## GSM254157 GeneChips were scanned using the GeneChip Scanner 3000 7G.
## GSM298223 GeneChips were scanned using the GeneChip Scanner 3000 7G.
## GSM298227 GeneChips were scanned using the GeneChip Scanner 3000 7G.
## GSM298228 GeneChips were scanned using the GeneChip Scanner 3000 7G.
## description
## GSM190150 small airways, non-smoker
## GSM190153 small airways, non-smoker
## GSM254157 small airways, smoker 112
## GSM298223 none
## GSM298227 none
## GSM298228 none
## data_processing
## GSM190150 The data were analyzed with Microarray Suite version 5.0 (MAS 5.0) using Affymetrix default analysis settings and global scaling as normalization method.
## GSM190153 The data were analyzed with Microarray Suite version 5.0 (MAS 5.0) using Affymetrix default analysis settings and global scaling as normalization method.
## GSM254157 The data were analyzed with Microarray Suite version 5.0 (MAS 5.0) using Affymetrix default analysis settings and global scaling as normalization method.
## GSM298223 The data were analyzed with Microarray Suite version 5.0 (MAS 5.0) using Affymetrix default analysis settings and global scaling as normalization method.
## GSM298227 The data were analyzed with Microarray Suite version 5.0 (MAS 5.0) using Affymetrix default analysis settings and global scaling as normalization method.
## GSM298228 The data were analyzed with Microarray Suite version 5.0 (MAS 5.0) using Affymetrix default analysis settings and global scaling as normalization method.
## platform_id contact_name contact_email
## GSM190150 GPL570 Yael,,Strulovici-Barel yas2003@med.cornell.edu
## GSM190153 GPL570 Yael,,Strulovici-Barel yas2003@med.cornell.edu
## GSM254157 GPL570 Yael,,Strulovici-Barel yas2003@med.cornell.edu
## GSM298223 GPL570 Yael,,Strulovici-Barel yas2003@med.cornell.edu
## GSM298227 GPL570 Yael,,Strulovici-Barel yas2003@med.cornell.edu
## GSM298228 GPL570 Yael,,Strulovici-Barel yas2003@med.cornell.edu
## contact_laboratory contact_department
## GSM190150 Crystal Department of Genetic Medicine
## GSM190153 Crystal Department of Genetic Medicine
## GSM254157 Crystal Department of Genetic Medicine
## GSM298223 Crystal Department of Genetic Medicine
## GSM298227 Crystal Department of Genetic Medicine
## GSM298228 Crystal Department of Genetic Medicine
## contact_institute contact_address contact_city
## GSM190150 Weill Cornell Medical College 1300 York Avenue New York
## GSM190153 Weill Cornell Medical College 1300 York Avenue New York
## GSM254157 Weill Cornell Medical College 1300 York Avenue New York
## GSM298223 Weill Cornell Medical College 1300 York Avenue New York
## GSM298227 Weill Cornell Medical College 1300 York Avenue New York
## GSM298228 Weill Cornell Medical College 1300 York Avenue New York
## contact_state contact_zip/postal_code contact_country
## GSM190150 NY 10021 USA
## GSM190153 NY 10021 USA
## GSM254157 NY 10021 USA
## GSM298223 NY 10021 USA
## GSM298227 NY 10021 USA
## GSM298228 NY 10021 USA
## supplementary_file
## GSM190150 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM190nnn/GSM190150/suppl/GSM190150.CEL.gz
## GSM190153 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM190nnn/GSM190153/suppl/GSM190153.CEL.gz
## GSM254157 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM254nnn/GSM254157/suppl/GSM254157.CEL.gz
## GSM298223 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM298nnn/GSM298223/suppl/GSM298223.CEL.gz
## GSM298227 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM298nnn/GSM298227/suppl/GSM298227.CEL.gz
## GSM298228 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM298nnn/GSM298228/suppl/GSM298228.CEL.gz
## supplementary_file.1
## GSM190150 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM190nnn/GSM190150/suppl/GSM190150.CHP.gz
## GSM190153 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM190nnn/GSM190153/suppl/GSM190153.CHP.gz
## GSM254157 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM254nnn/GSM254157/suppl/GSM254157.CHP.gz
## GSM298223 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM298nnn/GSM298223/suppl/GSM298223.CHP.gz
## GSM298227 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM298nnn/GSM298227/suppl/GSM298227.CHP.gz
## GSM298228 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM298nnn/GSM298228/suppl/GSM298228.CHP.gz
## data_row_count relation relation.1
## GSM190150 54675 Reanalyzed by: GSE119087
## GSM190153 54675 Reanalyzed by: GSE60486 Reanalyzed by: GSE119087
## GSM254157 54675 Reanalyzed by: GSE60486 Reanalyzed by: GSE119087
## GSM298223 54675
## GSM298227 54675 Reanalyzed by: GSE119087
## GSM298228 54675 Reanalyzed by: GSE119087
## age:ch1 cilia length:ch1 copd status:ch1
## GSM190150 34 <NA> <NA>
## GSM190153 45 <NA> <NA>
## GSM254157 45 <NA> <NA>
## GSM298223 38 <NA> <NA>
## GSM298227 29 <NA> <NA>
## GSM298228 39 <NA> <NA>
## department of genetic medicine id:ch1 ethnic group:ch1 ethnicity:ch1
## GSM190150 <NA> black <NA>
## GSM190153 <NA> hispanic <NA>
## GSM254157 <NA> white <NA>
## GSM298223 <NA> hispanic <NA>
## GSM298227 <NA> hispanic <NA>
## GSM298228 <NA> asian <NA>
## serum 25-oh-d:ch1 sex:ch1 smoking status:ch1 group
## GSM190150 <NA> M non-smoker non.smoker
## GSM190153 <NA> F non-smoker non.smoker
## GSM254157 <NA> M smoker, 23 pack-years smoker
## GSM298223 <NA> M non-smoker non.smoker
## GSM298227 <NA> M non-smoker non.smoker
## GSM298228 <NA> F non-smoker non.smoker
# So I think the features I want to keep will be:
# Dates of submission/updates etc, sex, ethnicity, smoking status
# Keep the columns that might contain data of interest, which will need to be cleaned up.
# List of column names I want to keep and clean up into usable labels
columns_to_find <- c("geo_accession","status","submission_date","last_update_date","characteristics_ch1","characteristics_ch1.1","characteristics_ch1.2","characteristics_ch1.3","age:ch1","cilia length:ch1","ethnic group:ch1","ethnicity:ch1","serum 25-oh-d:ch1","sex:ch1","smoking status:ch1","group")
# Get the column indexes
indexes <- sapply(columns_to_find, function(col_name) which(names(phenotypic_data) == col_name))
indexes <- unlist(indexes)
phenotypic_data_v1 <- phenotypic_data[,c(indexes)]
# Now I need to parse out sex, ethnicity, smoking status, and age, vitamin D, pack years.
#Rename "group" as "smoking status"
names(phenotypic_data_v1)[16] <- "smoking_status"
## Grabbing ethnicity values from the columns ##
# Initialize a new column "ethnicity" with NA values
phenotypic_data_v1$ethnicity <- NA
# Function to find 'eth' in a row and return the corresponding value
find_ethnicity <- function(row) {
eth_column <- which(grepl('eth', row))
if (length(eth_column) > 0) {
return(row[eth_column])
} else {
return(NA)
}
}
# Apply the function row-wise to populate the "ethnicity" column
phenotypic_data_v1$ethnicity <- apply(phenotypic_data_v1, 1, find_ethnicity)
## Grabbing sex values from the columns ##
# Initialize a new column "sex" with NA values
phenotypic_data_v1$sex <- NA
# Function to find 'sex' in a row and return the corresponding value
find_sex <- function(row) {
sex_column <- which(grepl('sex', row))
if (length(sex_column) > 0) {
return(row[sex_column])
} else {
return(NA)
}
}
# Apply the function row-wise to populate the "sex" column
phenotypic_data_v1$sex <- apply(phenotypic_data_v1, 1, find_sex)
## Grabbing pack_years values from the columns ##
# Initialize a new column "pack_years" with NA values
phenotypic_data_v1$pack_years <- NA
# Function to find 'pack_years' in a row and return the corresponding value, but just the first instance
find_pack_years <- function(row) {
pack_years_column <- which(grepl('pack', row))
if (length(pack_years_column) > 0) {
return(row[pack_years_column[1]])
} else {
return(NA)
}
}
# Apply the function row-wise to populate the "pack_years" column
phenotypic_data_v1$pack_years <- apply(phenotypic_data_v1, 1, find_pack_years)
#unlist the column
phenotypic_data_v1$pack_years <- unlist(phenotypic_data_v1$pack_years )
## Grabbing age values from the columns ##
# Initialize a new column "age" with NA values
phenotypic_data_v1$age <- NA
# Function to find 'age' in a row and return the corresponding value
find_age <- function(row) {
age_column <- which(grepl('age', row))
if (length(age_column) > 0) {
return(row[age_column])
} else {
return(NA)
}
}
# Apply the function row-wise to populate the "age" column
phenotypic_data_v1$age <- apply(phenotypic_data_v1, 1, find_age)
## Grabbing vitamin_d values from the columns ##
# Initialize a new column "vitamin_d" with NA values
phenotypic_data_v1$vitamin_d <- NA
# Function to find 'vitamin_d' in a row and return the corresponding value, first instance
find_vitamin_d <- function(row) {
vitamin_d_column <- which(grepl('vitamin', row))
if (length(vitamin_d_column) > 0) {
return(row[vitamin_d_column[1]])
} else {
return(NA)
}
}
# Apply the function row-wise to populate the "vitamin_d" column
phenotypic_data_v1$vitamin_d <- apply(phenotypic_data_v1, 1, find_vitamin_d)
## Grabbing vitamin_d values from the columns ##
# Initialize a new column "vitamin_d" with NA values
phenotypic_data_v1$vitamin_d <- NA
# Function to find 'vitamin_d' in a row and return the corresponding value, first instance
find_vitamin_d <- function(row) {
vitamin_d_column <- which(grepl('vitamin', row))
if (length(vitamin_d_column) > 0) {
return(row[vitamin_d_column[1]])
} else {
return(NA)
}
}
# Apply the function row-wise to populate the "vitamin_d" column
phenotypic_data_v1$vitamin_d <- apply(phenotypic_data_v1, 1, find_vitamin_d)
## Grabbing cilia values from the columns ##
# Initialize a new column "cilia_length" with NA values
phenotypic_data_v1$cilia_length <- NA
# Function to find 'cilia' in a row and return the corresponding value, first instance
find_cilia <- function(row) {
cilia_column <- which(grepl('cilia', row))
if (length(cilia_column) > 0) {
return(row[cilia_column[1]])
} else {
return(NA)
}
}
# Apply the function row-wise to populate the "cilia" column
phenotypic_data_v1$cilia_length <- apply(phenotypic_data_v1, 1, find_cilia)
## Now cut out the messy columns
phenotypic_data_v1 <- phenotypic_data_v1[,-c(5:15)]
## Remove unnecessary prefix info
phenotypic_data_v1$ethnicity <- gsub(".*: ", "", phenotypic_data_v1$ethnicity )
phenotypic_data_v1$age <- gsub(".*: ", "", phenotypic_data_v1$age)
phenotypic_data_v1$sex <- gsub(".*: ", "", phenotypic_data_v1$sex)
phenotypic_data_v1$vitamin_d <- gsub(".*: ", "", phenotypic_data_v1$vitamin_d)
phenotypic_data_v1$cilia_length <- gsub(".*: ", "", phenotypic_data_v1$cilia_length)
phenotypic_data_v1$pack_years<- gsub(".*, ", "", phenotypic_data_v1$pack_years)
phenotypic_data_v1$pack_years<- gsub("pack-years", "", phenotypic_data_v1$pack_years)
# Reformat the submission dates to be sortable
phenotypic_data_v1 <- phenotypic_data_v1 %>%
mutate(submission_date = ifelse(submission_date == "Dec 20 2012", "2012-12-20", submission_date)) %>%
mutate(submission_date = ifelse(submission_date == "Jan 03 2008", "2008-01-08", submission_date)) %>%
mutate(submission_date = ifelse(submission_date == "Jan 31 2013", "2013-01-31", submission_date)) %>%
mutate(submission_date = ifelse(submission_date == "Jun 03 2010", "2010-06-03", submission_date)) %>%
mutate(submission_date = ifelse(submission_date == "Jun 13 2008", "2008-06-13", submission_date)) %>%
mutate(submission_date = ifelse(submission_date == "May 17 2007", "2007-05-17", submission_date)) %>%
mutate(submission_date = ifelse(submission_date == "Nov 08 2013", "2013-11-08", submission_date)) %>%
mutate(submission_date = ifelse(submission_date == "Nov 10 2014", "2014-11-10", submission_date))
# This is now usable for a heatmap or checking for batch effect, though very incomplete.
#write.table(phenotypic_data_v1, "/Users/liambrockley/Desktop/Former_Smokers_Project/Former_Smokers_Aim_1/2_Output/GEO2R_limma_analysis/GSE63127/GSE63127_phenotypic_data_v1.txt", sep = '\t', col.names = TRUE, row.names = FALSE)
First batch effect correction (submission date)
library(sva)
library(limma)
# Making a batch vector
submission_post_2010_batch <- ifelse(phenotypic_data_v1$submission_date < as.Date("2012-01-01"), 1, 2)
# Adjust the expression matrix for batch effects
exprs_matrix_combat <- ComBat(dat=exprs(gset), batch=submission_post_2010_batch, mod=NULL, par.prior=TRUE, prior.plots=FALSE)
## Found2batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
## Plot PCA for corrected version ##
date_corrected_PCA <- plotMDS(exprs_matrix_combat,
gene.selection = "common",
main = "PCA for CS vs NS GSE63127, corrected for submission date",
col = colz, # Colors smokers red and nonsmokers black
pch = pointz
#labels = gset$group
)
legend("bottom",
legend = c("Smokers", "Nonsmokers",
"2010 and Prior", "Post-2010"),
col = c("red", "black", "darkgray", "darkgray"), # Colors: only for smoking status
pch = c(15, 15, 2, 1), # Shapes: 2 = triangle, 1 = circle
pt.cex = c(1, 1, 1.5, 1.5), # Adjust size for better visibility
text.col = "black", # Text color
bty = "n") # Box type: 'n' removes border

I tried to investigate the source of the second batch effect, but did
not get a definite answer – it does not help matters that the phenotypic
information is very incomplete. I did see that the 11 samples with sex
information are divided along these lines, but this is not enought to
draw a firm conclusion. See below the PCA for sex:
PCA for sex in first batch-corrected version
plotMDS(exprs_matrix_combat,
gene.selection = "common",
main = "PCA for CS vs NS GSE63127, corrected for submission date",
col = colz, # Colors smokers red and nonsmokers black
#pch = pointz2 # Using separate shapes for all submission dates
labels = phenotypic_data_v1$sex
)
legend("top",
legend = c("Smokers", "Nonsmokers",
"M = Male", "F = Female"),
col = c("red", "black", "darkgray", "darkgray"), # Colors: only for smoking status
pch = c(15, 15, NA, NA), # Shapes: 2 = triangle, 1 = circle
pt.cex = c(1, 1, 1.5, 1.5), # Adjust size for better visibility
text.col = "black", # Text color
bty = "n")

## Samples are divided by sex, but 11/182 samples is not enough to draw a conclusion here.
As such, I corrected for the second batch effect without knowing what
causes the second batch effect.
Second batch effect correction (unknown batch effect)
# Assign batch labels based on the first dimension from MDS (equivalent to PC1)
unknown_batch_labels <- ifelse(date_corrected_PCA$x < 0, 1, 2)
# Do a second batch correction
exprs_matrix_combat_2 <- ComBat(dat=exprs_matrix_combat, batch=unknown_batch_labels, mod=NULL, par.prior=TRUE, prior.plots=FALSE)
## Found2batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
# View PCA plot
plotMDS(exprs_matrix_combat_2,
gene.selection = "common",
main = "PCA for CS vs NS GSE63127, corrected for submission date and second batch (possibly sex)",
col = colz, # Colors smokers red and nonsmokers black
pch = pointz
#labels = gset$group
)
legend("topleft",
legend = c("Smokers", "Nonsmokers",
"2010 and Prior", "Post-2010"),
col = c("red", "black", "darkgray", "darkgray"), # Colors: only for smoking status
pch = c(15, 15, 2, 1), # Shapes: 2 = triangle, 1 = circle
pt.cex = c(1, 1, 1.5, 1.5), # Adjust size for better visibility
text.col = "black", # Text color
bty = "n") # Box type: 'n' removes border

## Now samples seem roughly divided along PC1 for smoking status
Saving the expression data
#Saving expression data for GSE63127
GSE63127_expr_matrix <- as.data.frame(exprs_matrix_combat_2) # Extract the expression matrix
GSE63127_sstat_data <- gset$group # Extract the smoking status labels
GSE63127_gene_symbol <- gset@featureData@data[["Gene.symbol"]] # Get the gene symbols to replace probeset IDs with
# Save the data
write.table(GSE63127_expr_matrix, "../2_Outputs/2_Airway_expression/GSE63127_expr_matrix_20241118.txt")
write.table(GSE63127_sstat_data, "../2_Outputs/2_Airway_expression/GSE63127_sstat_data.txt")
write.table(GSE63127_gene_symbol, "../2_Outputs/2_Airway_expression/GSE63127_gene_symbol.txt")
vooma, lmfit, and DEGs calculation
# Finish setting up the design matrix
design <- model.matrix(~group + 0, gset)
colnames(design) <- levels(gs)
## Crucial bit: Replace the expression values in gset with the batch corrected ones ##
exprs(gset) <- as.matrix(exprs_matrix_combat_2)
# calculate precision weights and show plot of mean-variance trend
v <- vooma(gset, design, plot=T)

# OR weights by group
# v <- voomaByGroup(gset, group=groups, design, plot=T, cex=0.1, pch=".", col=1:nlevels(gs))
v$genes <- fData(gset) # attach gene annotations
# fit linear model
fit <- lmFit(v)
# set up contrasts of interest and recalculate model coefficients
cts <- paste(groups[2], groups[1], sep="-")
cont.matrix <- makeContrasts(contrasts=cts, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
# compute statistics and table of top significant genes
fit2 <- eBayes(fit2, 0.01)
tT <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)
tT <- subset(tT, select=c("ID","Gene.symbol","logFC","adj.P.Val"))
# Filter unlabelled genes, duplicate genes
GSE63127_CS_NS_GEO2R_limma_all <- tT %>%
filter(Gene.symbol!= "") %>% # Remove blank gene symbols
group_by(Gene.symbol) %>%
slice_min(adj.P.Val, with_ties = TRUE) %>%
# For probesets mapping to same gene, keep one with lowest FDR. Keep ties for now to check later.
ungroup()
# Filter for FDR < 0.05
GSE63127_CS_NS_GEO2R_limma_sig <- GSE63127_CS_NS_GEO2R_limma_all %>%
filter(adj.P.Val <= 0.05) # Remove FDR > 0.05 genes
# Checking for ties
ties <- GSE63127_CS_NS_GEO2R_limma_sig %>%
group_by(Gene.symbol) %>%
filter(n() > 1) %>%
ungroup()
print(ties)
## # A tibble: 0 × 4
## # ℹ 4 variables: ID <chr>, Gene.symbol <chr>, logFC <dbl>, adj.P.Val <dbl>
# No ties
head(GSE63127_CS_NS_GEO2R_limma_sig)
## # A tibble: 6 × 4
## ID Gene.symbol logFC adj.P.Val
## <chr> <chr> <dbl> <dbl>
## 1 232462_s_at A1BG-AS1 0.531 2.24e- 2
## 2 218434_s_at AACS 0.128 2.82e- 3
## 3 223593_at AADAT -0.614 9.30e-19
## 4 202852_s_at AAGAB 0.179 3.13e- 3
## 5 225522_at AAK1 0.344 3.86e-15
## 6 220268_at AAMDC 0.657 1.81e- 2
nrow(GSE63127_CS_NS_GEO2R_limma_sig)
## [1] 7105
# Save the output (used absolute path for convenience)
#write.table(GSE63127_CS_NS_GEO2R_limma_sig, "../2_Outputs/1_Airway_DEGs/GSE63127_CS_NS_GEO2R_limma_sig_20241115.txt", sep = '\t', col.names = TRUE, row.names = FALSE)
P-value distribution and Q-Q plot
# Visualize and quality control test results.
# Build histogram of P-values for all genes. Normal test
# assumption is that most genes are not differentially expressed.
hist(GSE63127_CS_NS_GEO2R_limma_all$adj.P.Val, col = "grey", border = "white", xlab = "P-adj",
ylab = "Number of genes", main = "P-adj value distribution")

## This looks very skewed towards significant genes. An issue. Look into this.
## Now checking pre-processed distribution
tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)
hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj",
ylab = "Number of genes", main = "P-adj value distribution")

# Same issue
# summarize test results as "up", "down" or "not expressed"
dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=0)
# create Q-Q plot for t-statistic
t.good <- which(!is.na(fit2$F)) # filter out bad probes
qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic")

Expression data analysis for checking, and UMAP as well (since large
sample size)
# General expression data analysis
ex <- exprs(gset)
# # box-and-whisker plot
# ord <- order(gs) # order samples by group
# palette(c("#1B9E77", "#7570B3"))
# par(mar=c(7,4,2,1))
# title <- paste ("GSE63127", "/", annotation(gset), sep ="")
# boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord])
# legend("topleft", groups, fill=palette(), bty="n")
# expression value distribution
par(mar=c(4,4,2,1))
title <- paste ("GSE63127", "/", annotation(gset), " value distribution", sep ="")
plotDensities(ex, group=gs, main=title, legend ="topright")

# UMAP plot (dimensionality reduction)
## Since there are >100 samples (182 samples) included, I think it is valid to check UMAP
ex <- na.omit(ex) # eliminate rows with NAs
ex <- ex[!duplicated(ex), ] # remove duplicates
ump <- umap(t(ex), n_neighbors = 15, random_state = 123)
par(mar=c(3,3,2,6), xpd=TRUE)
plot(ump$layout, main="UMAP plot, nbrs=15", xlab="", ylab="", col=gs, pch=20, cex=1.5)
legend("topright", inset=c(-0.15,0), legend=levels(gs), pch=20,
col=1:nlevels(gs), title="Group", pt.cex=1.5)

#library("maptools") # point labels without overlaps
#pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6)
## 2 clusters that are somewhat mixed. Could still indicate some kind of confounding/batch.
Volcano plots
## Volcano plot##
library(ggplot2)
library(ggrepel)
library(ggpmisc)
# Renaming generically for convenience
DEGs <- GSE63127_CS_NS_GEO2R_limma_all
DEGs_significant <- GSE63127_CS_NS_GEO2R_limma_sig
GSE63127_CS_NS_GEO2R_limma_volcano <- ggplot(DEGs, aes(x = logFC, y = -log10(adj.P.Val))) +
geom_point(aes(color = ifelse(adj.P.Val < 0.05, "red", "black")), alpha = 0.6, show.legend = FALSE) +
stat_dens2d_labels(data = DEGs_significant,
aes(label = Gene.symbol), geom = "text_repel",
size = 2,
position = position_nudge_centre(x = 0.1,
y = 0.1,
direction = "radial"),
min.segment.length = 0,
fontface = "bold") +
scale_color_manual(values = c("black", "red")) +
theme_minimal() +
labs(title = "DEGs in GSE63127 Current vs. Never Smokers",
x = "log2FC",
y = "-log10(FDR)",
caption = paste0("significant DEGs: n = ", nrow(DEGs_significant)))
GSE63127_CS_NS_GEO2R_limma_volcano
## Warning: ggrepel: 657 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
